# install.packages("tidyverse")
# install.packages("gapminder")

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gapminder)
library(broom) # tidy(), glance()

# gapminder
# View(gapminder)

# Scatter plot of GDP per capita against life expectancy.
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# Flag the United Kingdom to show that each country appears multiple times.
ggplot(gapminder %>% mutate(indicator = (country == "United Kingdom")),
       aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(colour = indicator))
# We can see that the UK has been repeated for a bunch of different years.
# So, now that we know a country is repeated, we might want to visualise
# just the latest year for each country.
# Piping (%>%) is part of the tidyverse.
gapminder %>% nrow()
## [1] 1704

# filter() is part of the tidyverse (dplyr).
gapminder %>%
  filter(country == "Afghanistan")
## # A tibble: 12 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## 11 Afghanistan Asia 2002 42.1 25268405 727.
## 12 Afghanistan Asia 2007 43.8 31889923 975.

# Now we want to filter by year and compute the mean life expectancy.
gapminder %>%
  filter(year == 2007) %>%
  pull(lifeExp) %>%
  mean()
## [1] 67.00742
Let's combine plotting and filtering:
gapminder %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# Same plot with a smoothed trend line (ggplot defaults to LOESS here).
gapminder %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Linear model: force a straight-line smoother instead of LOESS.
gapminder %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Saving variables and using functions.
gm2007 <- gapminder %>%
  filter(year == 2007)

# Printing an lm fit directly shows only the call and the coefficients.
lm(lifeExp ~ gdpPercap, data = gm2007)
##
## Call:
## lm(formula = lifeExp ~ gdpPercap, data = gm2007)
##
## Coefficients:
## (Intercept) gdpPercap
## 5.957e+01 6.371e-04
# We don't want this printed output; instead we save the result and apply
# functions to it.
model_lm <- lm(lifeExp ~ gdpPercap, data = gm2007)
# summary() gives more than just the estimated coefficients (intercept and
# slope): residual quantiles, standard errors, R-squared, F-statistic, ...
summary(model_lm)
##
## Call:
## lm(formula = lifeExp ~ gdpPercap, data = gm2007)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.828 -6.316 1.922 6.898 13.128
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.957e+01 1.010e+00 58.95 <2e-16 ***
## gdpPercap 6.371e-04 5.827e-05 10.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.899 on 140 degrees of freedom
## Multiple R-squared: 0.4606, Adjusted R-squared: 0.4567
## F-statistic: 119.5 on 1 and 140 DF, p-value: < 2.2e-16
# Residuals: the differences between observed and fitted (predicted) values.
residuals(model_lm)
## 1 2 3 4 5 6
## -16.3585885 13.0746657 8.7702301 -19.8911299 7.6121709 -0.2705981
## 7 8 9 10 11 12
## -2.7540718 -2.9147296 3.6099346 -1.5913589 -3.7559419 3.5531359
## 13 14 15 16 17 18
## 10.5420588 -16.8463317 7.0482188 6.6342522 -8.0460633 -10.2596628
## 19 20 21 22 23 24
## -0.9345570 -10.4367387 -2.0528745 -15.2744773 -10.0003672 10.5952492
## 25 26 27 28 29 30
## 10.2357286 8.8592184 4.9580414 -13.2804878 -6.5580766 13.0711521
## 31 32 33 34 35 36
## -12.2218631 6.8679441 13.0062081 2.3724697 -3.7107349 -6.1014702
## 37 38 39 40 41 42
## 8.8303780 11.0491599 8.2163890 8.6626204 -15.7304355 -1.9342885
## 43 44 45 46 47 48
## -7.0587859 -1.4100171 1.6778622 -11.2449522 -0.5972526 -0.6564938
## 49 50 51 52 53 54
## -0.3895150 2.3716877 7.3891404 -4.1592473 -13.5466984 0.5847459
## 55 56 57 58 59 60
## 8.3715872 -2.6677900 2.2982367 -0.8606659 3.5699630 8.8284799
## 61 62 63 64 65 66
## 4.0039531 -2.8693162 -6.5967158 4.9175988 2.7776063 8.3369672
## 67 68 69 70 71 72
## 2.8681884 10.0898469 -6.3879361 6.7163535 4.1814531 -12.1185481
## 73 74 75 76 77 78
## 5.7622523 -17.9735247 -14.1517469 6.7041055 -0.7883088 -11.7464578
## 79 80 81 82 83 84
## 6.7419750 -5.7629144 3.4495006 6.2542769 8.9980281 5.2649277
## 85 86 87 88 89 90
## 9.0813768 9.1643859 -18.0084483 1.9018953 -9.7249409 3.5240074
## 91 92 93 94 95 96
## -3.2488695 4.5921209 11.5816637 -3.0934674 -13.9898238 -10.8168007
## 97 98 99 100 101 102
## 1.8559417 4.2570118 9.7215829 9.5276921 7.1348833 10.0895856
## 103 104 105 106 107 108
## 6.1919036 5.4649532 6.8653696 11.9894530 6.0239012 -13.8735532
## 109 110 111 112 113 114
## 4.9439324 -0.5856827 2.4052755 8.2010146 -17.5472042 -9.6301791
## 115 116 117 118 119 120
## 3.1967583 1.9425134 -11.9967262 -16.1326655 3.0124664 10.3008666
## 121 122 123 124 125 126
## -2.6677248 -22.8283427 -0.2548516 -1.7612700 11.9112315 0.5369554
## 127 128 129 130 131 132
## -7.7542648 6.2983510 -1.7082204 -1.2204860 9.8382065 6.8222933
## 133 134 135 136 137 138
## -8.6967059 -1.2955812 -8.6896144 10.0574246 6.9079504 13.1277383
## 139 140 141 142
## 11.9287963 1.6791936 -17.9915824 -16.3779179
# Fitted values for every observation in the training data.
predict(model_lm)
## 1 2 3 4 5 6 7 8
## 60.18659 63.34833 63.53077 62.62213 67.70783 81.50560 82.58307 78.54973
## 9 10 11 12 13 14 15 16
## 60.45207 81.03236 60.48394 62.00086 64.30994 67.57433 65.34178 66.37075
## 17 18 19 20 21 22 23 24
## 60.34106 59.83966 60.65756 60.86674 82.70587 60.01548 60.65137 67.95775
## 25 26 27 28 29 30 31 32
## 62.72527 64.02978 60.19396 59.74249 61.88008 65.71085 60.54986 68.88006
## 33 34 35 36 37 38 39 40
## 65.26679 74.11353 82.04273 60.89247 63.40462 63.94484 63.12161 63.21538
## 41 42 43 44 45 46 47 48
## 67.30944 59.97429 60.00579 80.72302 78.97914 67.97995 60.04525 80.06249
## 49 50 51 52 53 54 55 56
## 60.41152 77.11131 62.86986 60.16625 59.93470 60.33125 61.82641 84.87579
## 57 58 59 60 61 62 63 64
## 71.03976 82.61767 61.12804 61.82152 66.96005 62.41432 85.48172 75.82740
## 65 66 67 68 69 70 71 72
## 77.76839 64.23003 79.73481 62.44515 60.49794 60.58065 74.44155 89.70655
## 73 74 75 76 77 78 79 80
## 66.23075 60.56552 59.82975 67.24789 60.23131 60.04946 67.49903 60.22991
## 81 82 83 84 85 86 87 88
## 60.71450 66.54672 67.19697 61.53807 65.46162 61.99961 60.09045 60.16710
## 89 90 91 92 93 94 95 96
## 62.63094 60.26099 83.01087 75.61188 61.31734 59.96047 60.84882 91.01280
## 97 98 99 100 101 102 103 104
## 73.78406 61.22599 65.81542 62.22431 64.28612 61.59841 69.37110 72.63305
## 105 106 107 108 109 110 111 112
## 71.88063 64.45255 66.45210 60.11555 60.58407 73.36268 60.65672 65.80099
## 113 114 115 116 117 118 119 120
## 60.11520 89.60218 71.46624 75.98349 60.15573 65.47167 77.92853 62.09513
## 121 122 123 124 125 126 127 128
## 61.22372 62.44134 81.13885 83.46227 62.23177 77.86304 60.27126 64.31765
## 129 130 131 132 133 134 135 136
## 60.12822 71.03949 64.08479 64.95471 60.23871 80.72058 86.93161 66.32658
## 137 138 139 140 141 142
## 66.83905 61.12126 61.49320 61.01881 60.37558 59.86492
# broom::tidy(): the coefficient table as a tidy tibble.
tidy(model_lm)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 59.6 1.01 59.0 9.89e-101
## 2 gdpPercap 0.000637 0.0000583 10.9 1.69e- 20

# broom::glance(): one-row model summary; handy for comparing several models.
glance(model_lm)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.461 0.457 8.90 120. 1.69e-20 1 -511. 1028. 1037.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

# broom::augment(): the original data plus fitted values, residuals, etc.
augment(model_lm)
## # A tibble: 142 × 8
## lifeExp gdpPercap .fitted .resid .hat .sigma .cooksd .std.resid
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 43.8 975. 60.2 -16.4 0.0120 8.82 0.0207 -1.85
## 2 76.4 5937. 63.3 13.1 0.00846 8.86 0.00928 1.48
## 3 72.3 6223. 63.5 8.77 0.00832 8.90 0.00411 0.990
## 4 42.7 4797. 62.6 -19.9 0.00907 8.77 0.0231 -2.25
## 5 75.3 12779. 67.7 7.61 0.00709 8.91 0.00263 0.858
## 6 81.2 34435. 81.5 -0.271 0.0292 8.93 0.0000144 -0.0309
## 7 79.8 36126. 82.6 -2.75 0.0327 8.93 0.00167 -0.315
## 8 75.6 29796. 78.5 -2.91 0.0211 8.93 0.00118 -0.331
## 9 64.1 1391. 60.5 3.61 0.0116 8.93 0.000975 0.408
## 10 79.4 33693. 81.0 -1.59 0.0278 8.93 0.000471 -0.181
## # … with 132 more rows
# augment() gives us the original dataset plus the predicted value from the
# model, residuals, standard errors, etc. -- i.e. very useful for applying
# the results of the model.
augment(model_lm) %>%
  ggplot(aes(gdpPercap, lifeExp)) +
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))

# Adding more complexity, not by adding more predictors, but by allowing a
# more flexible functional relationship (i.e. not linear) using LOESS --
# Locally Estimated Scatterplot Smoothing.
# Fit a LOESS curve (defaults: span = 0.75, degree = 2).
model_loess <- loess(lifeExp ~ gdpPercap, data = gm2007)
summary(model_loess)
## Call:
## loess(formula = lifeExp ~ gdpPercap, data = gm2007)
##
## Number of Observations: 142
## Equivalent Number of Parameters: 4.76
## Residual Standard Error: 7.119
## Trace of smoother matrix: 5.2 (exact)
##
## Control settings:
## span : 0.75
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
# tidy(model_loess)   # loess objects don't have a tidy() method
# glance(model_loess) # loess objects don't have a glance() method
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) +
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
# span controls the degree of smoothing.
LOESS will look at local data only. The span tells us how local the data should be; changing it makes the window of data points in focus narrower or wider.
# changing span to 0.3 makes a narrower focus and less smoother curve
# closer span is to 1, the smoother the curve gets
# Narrower span (0.3): a narrower focus, so a less smooth curve.
model_loess <- loess(lifeExp ~ gdpPercap,
                     span = 0.3,
                     data = gm2007)
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) +
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
# Locally degree 1 (i.e. locally linear); the wider span (0.9) makes the
# curve smoother.
model_loess <- loess(lifeExp ~ gdpPercap,
                     span = 0.9,
                     degree = 1,
                     data = gm2007)
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) +
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))

# poly() creates a polynomial of the given degree.
# NOTE(review): poly(gdpPercap, 2) already spans a linear term, so the extra
# gdpPercap term is collinear and lm() will report an aliased (NA)
# coefficient; the fitted values are unaffected -- TODO confirm intent.
model_lm2 <- lm(lifeExp ~ gdpPercap + poly(gdpPercap, 2), data = gm2007)
# Plot the global polynomial fit.
augment(model_lm2) %>%
  ggplot(aes(gdpPercap, lifeExp)) +
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
# Now with the LOESS model:
# This is locally degree 2, with span 0.5 (fit on half of the dataset at a
# time) for a moderately smooth curve.
model_loess <- loess(lifeExp ~ gdpPercap,
                     span = 0.5, # fit half of the dataset
                     degree = 2,
                     data = gm2007)
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) +
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
# Both lm and loess used a degree-2 polynomial but give very different
# results, because loess gives local flexibility whereas lm is a global model.
Getting to know the data
# fivethirtyeight contains the candy_rankings dataset.
library(fivethirtyeight)
## Some larger datasets need to be installed separately, like senators and
## house_district_forecast. To install these, we recommend you install the
## fivethirtyeightdata package by running:
## install.packages('fivethirtyeightdata', repos =
## 'https://fivethirtyeightdata.github.io/drat/', type = 'source')
?candy_rankings # open the help page for this dataset
candy_rankings  # print the dataset
## # A tibble: 85 × 13
## competitorname chocolate fruity caramel peanutyalmondy nougat
## <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 100 Grand TRUE FALSE TRUE FALSE FALSE
## 2 3 Musketeers TRUE FALSE FALSE FALSE TRUE
## 3 One dime FALSE FALSE FALSE FALSE FALSE
## 4 One quarter FALSE FALSE FALSE FALSE FALSE
## 5 Air Heads FALSE TRUE FALSE FALSE FALSE
## 6 Almond Joy TRUE FALSE FALSE TRUE FALSE
## 7 Baby Ruth TRUE FALSE TRUE TRUE TRUE
## 8 Boston Baked Beans FALSE FALSE FALSE TRUE FALSE
## 9 Candy Corn FALSE FALSE FALSE FALSE FALSE
## 10 Caramel Apple Pops FALSE TRUE TRUE FALSE FALSE
## # … with 75 more rows, and 7 more variables: crispedricewafer <lgl>,
## # hard <lgl>, bar <lgl>, pluribus <lgl>, sugarpercent <dbl>,
## # pricepercent <dbl>, winpercent <dbl>
ggplot(candy_rankings, aes(x = pricepercent, y = winpercent)) +
  geom_point()

# Linear model: does price predict win percentage?
model_lm <- lm(winpercent ~ pricepercent, data = candy_rankings)
tidy(model_lm)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 42.0 2.91 14.4 2.39e-24
## 2 pricepercent 17.8 5.30 3.35 1.21e- 3
augment(model_lm) %>%
  ggplot(aes(x = pricepercent, y = winpercent)) +
  geom_point() +
  geom_line(aes(y = .fitted))
# More variables -- categorical:
# Add a categorical predictor: is the candy chocolate?
model_lm <- lm(winpercent ~ pricepercent + chocolate, data = candy_rankings)
tidy(model_lm)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 41.6 2.40 17.3 4.35e-29
## 2 pricepercent 1.66 5.08 0.328 7.44e- 1
## 3 chocolateTRUE 18.3 2.91 6.29 1.46e- 8
Notice: when you add a categorical predictor to your linear model, you get the same simple regression slope but different intercepts for the different levels of that categorical variable. You can see an intercept for some baseline level of the categorical variable and then a shift in intercept for the other levels relative to that baseline. Here, chocolate = FALSE is the baseline group (intercept 41.6) and chocolate = TRUE is the other group (intercept of 41.6 + 18.3 ≈ 59.9).
# One fitted line per level of chocolate, distinguished by colour/shape/linetype.
augment(model_lm) %>%
  ggplot(aes(x = pricepercent, y = winpercent,
             color = chocolate,
             shape = chocolate,
             linetype = chocolate)) +
  geom_point() +
  geom_line(aes(y = .fitted))
# More variables -- continuous:
When we add continuous variables, we can no longer plot in 2D… Here, let's try to plot in 3D.
library(plotly)##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
candy <- candy_rankings
# 3D scatter of price vs sugar vs win percentage.
candy3d <- plot_ly(data = candy_rankings,
                   x = ~pricepercent,
                   y = ~sugarpercent,
                   z = ~winpercent,
                   type = "scatter3d",
                   mode = "markers") # explicit mode avoids plotly's
                                     # "No scatter3d mode specifed" warning
candy3d
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
# Fit a plane: winpercent modelled by price and sugar content.
model_lm <- lm(winpercent ~ pricepercent + sugarpercent, data = candy_rankings)

# Build a 101 x 101 grid over the unit square to predict on.
xy_plane <- expand.grid(0:100, 0:100) / 100
ps_plane <- xy_plane %>%
  rename(pricepercent = Var1,
         sugarpercent = Var2)
lm_plane <- augment(model_lm, newdata = ps_plane)
lm_matrix <- matrix(lm_plane$.fitted, nrow = 101, ncol = 101)

# Overlay the fitted regression plane on the 3D scatter plot.
candy3d %>%
  add_surface(
    x = ~(0:100) / 100,
    y = ~(0:100) / 100,
    z = ~lm_matrix
  )
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
tidy(model_lm)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 39.8 3.44 11.6 6.33e-19
## 2 pricepercent 15.6 5.60 2.78 6.72e- 3
## 3 sugarpercent 6.73 5.66 1.19 2.38e- 1
More variables - both categorical and continuous
# 3D scatter coloured and shaped by whether the candy is chocolate.
chocolate3d <- plot_ly(data = candy_rankings,
                       x = ~pricepercent,
                       y = ~sugarpercent,
                       z = ~winpercent,
                       color = ~chocolate,
                       colors = c("#2d708e", "#d8576b"),
                       mode = "markers", # was misspelled "made", which plotly
                                         # rejected with the warnings below
                       symbol = ~chocolate,
                       symbols = c("o", "circle"),
                       type = "scatter3d",
                       showlegend = FALSE)
chocolate3d
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: 'scatter3d' objects don't have these attributes: 'made'
## Valid attributes include:
## 'connectgaps', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'error_z', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'line', 'marker', 'meta', 'metasrc', 'mode', 'name', 'opacity', 'projection', 'scene', 'showlegend', 'stream', 'surfaceaxis', 'surfacecolor', 'text', 'textfont', 'textposition', 'textpositionsrc', 'textsrc', 'texttemplate', 'texttemplatesrc', 'transforms', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## Warning: 'scatter3d' objects don't have these attributes: 'made'
## Valid attributes include:
## 'connectgaps', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'error_z', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'line', 'marker', 'meta', 'metasrc', 'mode', 'name', 'opacity', 'projection', 'scene', 'showlegend', 'stream', 'surfaceaxis', 'surfacecolor', 'text', 'textfont', 'textposition', 'textpositionsrc', 'textsrc', 'texttemplate', 'texttemplatesrc', 'transforms', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
candy <- candy_rankings
# Add chocolate (categorical) to the plane model.
model_lm <- lm(winpercent ~ pricepercent + sugarpercent + chocolate, data = candy)

# Predicted surface for chocolate == TRUE.
ps_plane <- ps_plane %>%
  mutate(chocolate = TRUE)
lm_plane <- augment(model_lm, newdata = ps_plane)
lm_matrix_true <- matrix(lm_plane$.fitted, nrow = 101, ncol = 101)

# Predicted surface for chocolate == FALSE.
ps_plane <- ps_plane %>%
  mutate(chocolate = FALSE)
lm_plane <- augment(model_lm, newdata = ps_plane)
lm_matrix_false <- matrix(lm_plane$.fitted, nrow = 101, ncol = 101)
# Overlay both fitted surfaces (chocolate TRUE and FALSE) on the 3D scatter.
# showscale = FALSE hides the colour bars; inherit = FALSE stops the surface
# traces from inheriting attributes set on the scatter3d trace.
chocolate3d %>%
add_surface(
x = ~(0:100)/100,
y = ~(0:100)/100,
z = ~lm_matrix_true,
showscale = FALSE,
inherit = FALSE,
colorscale = list(c(0,1),c("#f0f921","#7201a8"))) %>%
add_surface(
x = ~(0:100)/100,
y = ~(0:100)/100,
z = ~lm_matrix_false,
showscale = FALSE,
inherit = FALSE,
colorscale = list(c(0,1), c("#3cbb75","#481567")))## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: 'scatter3d' objects don't have these attributes: 'made'
## Valid attributes include:
## 'connectgaps', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'error_z', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'line', 'marker', 'meta', 'metasrc', 'mode', 'name', 'opacity', 'projection', 'scene', 'showlegend', 'stream', 'surfaceaxis', 'surfacecolor', 'text', 'textfont', 'textposition', 'textpositionsrc', 'textsrc', 'texttemplate', 'texttemplatesrc', 'transforms', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## Warning: 'scatter3d' objects don't have these attributes: 'made'
## Valid attributes include:
## 'connectgaps', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'error_z', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'line', 'marker', 'meta', 'metasrc', 'mode', 'name', 'opacity', 'projection', 'scene', 'showlegend', 'stream', 'surfaceaxis', 'surfacecolor', 'text', 'textfont', 'textposition', 'textpositionsrc', 'textsrc', 'texttemplate', 'texttemplatesrc', 'transforms', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
# Coefficients for the price + sugar + chocolate model.
tidy(model_lm)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 38.6 2.81 13.8 6.50e-23
## 2 pricepercent -1.66 5.27 -0.315 7.54e- 1
## 3 sugarpercent 9.04 4.63 1.95 5.42e- 2
## 4 chocolateTRUE 18.7 2.87 6.53 5.40e- 9
# Use every column except the unique identifier as a predictor.
model_lm_all <- lm(winpercent ~ .,
                   data = candy_rankings %>% select(-competitorname))
tidy(model_lm_all)
## # A tibble: 12 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 34.5 4.32 7.99 1.44e-11
## 2 chocolateTRUE 19.7 3.90 5.07 2.96e- 6
## 3 fruityTRUE 9.42 3.76 2.50 1.45e- 2
## 4 caramelTRUE 2.22 3.66 0.608 5.45e- 1
## 5 peanutyalmondyTRUE 10.1 3.62 2.79 6.81e- 3
## 6 nougatTRUE 0.804 5.72 0.141 8.88e- 1
## 7 crispedricewaferTRUE 8.92 5.27 1.69 9.47e- 2
## 8 hardTRUE -6.17 3.46 -1.78 7.85e- 2
## 9 barTRUE 0.442 5.06 0.0872 9.31e- 1
## 10 pluribusTRUE -0.854 3.04 -0.281 7.79e- 1
## 11 sugarpercent 9.09 4.66 1.95 5.50e- 2
## 12 pricepercent -5.93 5.51 -1.08 2.86e- 1
Competitorname (i.e. the candy name) is a unique identifier, and we want to avoid using it as a predictor — otherwise we wouldn't really be grouping anything.
# Stack the one-row glance() summaries to compare the two models side by side.
rbind(glance(model_lm), glance(model_lm_all))
## # A tibble: 2 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.433 0.412 11.3 20.6 5.20e-10 3 -325. 659. 671.
## 2 0.540 0.471 10.7 7.80 9.50e- 9 11 -316. 657. 689.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
We know that when we add more predictors, the R-squared will increase. Adjusted R-squared penalises R-squared using the ratio of predictors to sample size, so it can decrease when we add more predictors. Here, adjusted R-squared increased for model_lm_all, which is an indicator that it is the better model for predicting the outcome.